Load Data

Load Enable Data

library(readxl)
library(tidytable)
## 
## Attaching package: 'tidytable'
## The following objects are masked from 'package:stats':
## 
##     dt, filter, lag
## The following object is masked from 'package:base':
## 
##     %in%
conflicted::conflict_prefer_all('tidytable', quiet = TRUE)
# folder containing all raw input files
data_folder <- "data"

# load enable data; header starts on row 3 (skip = 2), treat 'NA'/'N/A'/empty as missing
enable_data <- read_xlsx(file.path(data_folder, "enable_Datensatz_RMR._Erwachsene 1.xlsx"), skip = 2, na=c('NA', 'N/A', ''))

# fix compound values
# SCFA / organic-acid columns that may contain "<0" below-detection-limit entries
compounds <- c(
  "acetic acid a", "butyric acid a", "propionic acid a",
  "2-Methylbutyrate a", "hexanoic acid c", "Isobutyrate a",
  "Isovalerate a", "pentanoic acid a",
  "4-Methylvaleric acid a", "Lactic acid a"
)

# for every character column, replace the decimal comma with a dot so values
# can later be parsed as numeric
# NOTE(review): sub() replaces only the FIRST comma per value — assumes at
# most one comma per entry (German decimal notation)
for (column_name in names(enable_data)) {
  if (is.character(enable_data[[column_name]])) {
    enable_data[[column_name]] <-
      sub(",", ".", enable_data[[column_name]], fixed = TRUE)
    # for compounds, replace "<0" with 0 to be able to convert to numeric
    if (column_name %in% compounds) {
      # below-detection-limit entries ("<0" or "< 0") become 0
      enable_data[[column_name]] <-
        ifelse(enable_data[[column_name]] == "<0" |
                 enable_data[[column_name]] == "< 0", 0, enable_data[[column_name]])
    }
  }
}

# write to csv and read again using fread to get the correct data types
# NOTE(review): file is named .tsv but fwrite's default separator is ","
# (content is CSV); fread auto-detects the separator, so the round-trip works
data_csv <- file.path(data_folder, 'enable_data.tsv')
fwrite(enable_data, file = data_csv)
enable_data <- data.table::fread(data_csv, stringsAsFactors = TRUE, na=c('NA', 'N/A', ''))

# add 1 to the weight of the nuernberg site
# (presumably a known scale offset at that site — TODO confirm with data owners)
enable_data[Site == "nuernber", `GEWICHt_SECA, kg` := `GEWICHt_SECA, kg` + 1]

# harmonize site labels ("nuernber" is the raw spelling in the data file)
enable_data[Site == "nuernber", Site := "Nuremberg"]
enable_data[Site == "freising", Site := "Freising"]

# remove unnecessary column: `Probanden-ID` duplicates Label (verified first)
if(all(enable_data[, `Probanden-ID` == Label]))
  invisible(enable_data[, `Probanden-ID` := NULL])

Load Microbiome Data

# load microbiome mapping (sample IDs -> subject codes / cohort)
microbiome_mapping <- data.table::setDT(read_xlsx(file.path(data_folder, "mapping_file 1.xlsx")))
# keep the part after the first "." of the sample ID
# (the positional "." is passed through tstrsplit's ... to strsplit as the split)
microbiome_mapping[, sample_clean := data.table::tstrsplit(".", x = `#SampleID`, fixed = TRUE, keep = 2)]
# check uniqueness
stopifnot(!any(microbiome_mapping[, duplicated(sample_clean)]))

# load microbiome data (taxa in rows, samples in columns)
microbiome_data <- fread(file.path(data_folder, "tax.summary.all 1.tab"))
# clean the sample (column) names the same way as in the mapping
stripped_names <- unlist(data.table::tstrsplit(".", x = names(microbiome_data), fixed = TRUE, keep = 2))
# check uniqueness
stopifnot(!any(duplicated(stripped_names)))
# clean names
names(microbiome_data) <- stripped_names
# get taxa names (first column V1 holds the taxon identifiers)
taxa_names <- microbiome_data[, V1]
# transpose data: rows become samples, columns become taxa
microbiome_data <- microbiome_data |> data.table::transpose(keep.names = "sample_clean", make.names = "V1")
# merge microbiome data with mapping (in-place update join on sample_clean)
microbiome_data[microbiome_mapping, on=.(sample_clean), c("Label", "Cohort"):=.(Code, as.character(Cohort))]
# merge with enable data (in-place update join on Label; adds taxa columns and Cohort)
invisible(enable_data[microbiome_data, on=.(Label), (c(taxa_names, "Cohort")) := mget(c(taxa_names, "Cohort"))])
# create field for first letter of label, i.e., categorical Age and Site
invisible(enable_data[, Label_group:=substr(Label, 1, 1)])

Add Alpha Diversity

# diversity columns
diversity_columns <- c("Shannon.effective", "Simpson.effective")
# load diversity data; the unnamed first column V1 holds the sample label
diversity_data <- fread('data/Final table.tab', stringsAsFactors=TRUE) |> data.table::setnames(old='V1', new="Label") |> select(Label, all_of(diversity_columns))

# add the diversity columns to enable_data via an in-place update join on Label
invisible(enable_data[diversity_data, on=.(Label), (c(diversity_columns)) := mget(diversity_columns)])

Set Variable Groups

# map raw (German, unit-suffixed) column names to clean English display names
name_mapping <- c(
  "SEX" = "Sex",
  "Alter, Jahre" = "Age",
  "mean_temp" = "Mean Outdoor Temperature",
  "Datum_V1" = "Date",
  "GROESSE, cm" = "Height",
  "GEWICHt_SECA, kg" = "Weight",
  "FETTMASSE_SECA, kg" = "FM",
  "FFM_SECA,kg" = "FFM",
  "VISZFETT,l" = "Visceral Fat",
  "TAILLENUMFANG,cm" = "Waist",
  "HUEFTUMFANG, cm" = "Hip",
  "Körpertemperatur, °C" = "Body Temperature",
  "RMR.KJ" = "RMR",
  "LEUKOZYTEN/nl" = "Leukocytes",
  "hsCRP, mg/dl" = "hsCRP",
  "INSULIN, µU/ml" = "Insulin",
  "TSH, µU/ml" = "TSH",
  "FT3, pg/ml" = "fT3",
  "Hämoglobin, g/dl" = "Hemoglobin",
  "Hämatokrit, %" = "Hematocrit",
  "MCHC, g/dl" = "MCHC",
  "GOTAST, U/l" = "GOT AST",
  "GPTALT, U/l" = "GPT ALT",
  "GGT, U/l" = "GGT",
  "LDH, U/l" = "LDH",
  "Blutzucker, mg/dl" = "Glucose",
  "Cholesterin, , mg/dl" = "Cholesterol",
  "Triglyzeride, mg/dl" = "Triglycerides",
  "HDLCHOL, mg/dl" = "HDL",
  "LDLCHOL, mg/dl" = "LDL",
  "KREATIN, mg/dl" = "Creatinine",
  "GFRCKDE, mg/min" = "GFR",
  "HARNSTOFF, mg/dl" = "Urea",
  "Harnsäure, mg/dl" = "Uric Acid",
  "EISEN,µg/dl" = "Iron",
  "FERRITIN,ng/ml" = "Ferritin",
  "Blutdruck_systolisch, mmHg" = "BP Systolic",
  "Blutdruck_diastolisch, mmHg" = "BP Diastolic",
  "Herzfrequenz, Schläge pro Minute" = "Heart Rate"
)

# rename the raw columns to their clean English names
data.table::setnames(enable_data, old = names(name_mapping), new = name_mapping)

# groups for taxa: derive the taxonomic rank from the prefix before "__"
# (k__, p__, c__, o__, f__, g__)
taxa_group <-
  c(
    k = 'kingdom',
    p = 'phylum',
    c = 'class',
    o = 'order',
    f = 'family',
    g = 'genus'
  )[data.table::tstrsplit(taxa_names,
                          split = '__',
                          keep = 1,
                          fixed = TRUE)[[1]]]
# match the columns to certain groups
column_groups <- rbind(data.table::data.table(column = 'Label_group', group = "General information"),
                       fread(file.path(data_folder, 'column_groups.tsv')),
                       data.table::data.table(column = taxa_names, group = paste("microbiome", taxa_group)),
                       data.table::data.table(column = diversity_columns, group = paste("microbiome diversity")))
column_groups[, group:=as.factor(group)]
# translate raw column names in the group table to the clean English names
column_groups[column %in% names(name_mapping), column := name_mapping[column]]

# set general response variable
response_variable <- "RMR"
# Conversion factor kcal to kilojoule
conversion_factor <- 4.184
# define grouping column
grouping_column <- "Label_group"
# get other labelling columns
other_labelling_columns <- c("Label", "Site", "Cohort", "Date")
# set the predictor of the established model
age_predictor <- "Age"
sex_predictor <- "Sex"
weight_predictor <- "Weight"
height_predictor <- "Height"
basic_predictors <- c(age_predictor, sex_predictor, weight_predictor, height_predictor, "FM", "FFM")
# define microbial predictors (family level)
microbiome_predictors <- column_groups[group == 'microbiome family', column]
# # other microbial columns
# other_microbial_columns <- setdiff(taxa_names, microbial_predictors)
temperature_predictor <- "Mean Outdoor Temperature"
# define the general predictors: everything that is neither response, grouping,
# labelling nor a taxon column
general_predictors <- setdiff(names(enable_data), c(response_variable, grouping_column, other_labelling_columns, taxa_names))
# make sure all predictors are in the data
stopifnot(all(c(response_variable, grouping_column, other_labelling_columns, taxa_names, basic_predictors, microbiome_predictors, general_predictors) %in% names(enable_data)))

Load Kiel Data

# load the Kiel validation cohort and harmonize it to the enable_data schema
kiel_data <- data.table::setDT(read_xlsx(file.path(data_folder, "REE Datenbank TUM_Januar 2025tf QM.xlsx")))
## New names:
## • `` -> `...12`
## • `?` -> `?...14`
## • `?` -> `?...15`
## • `?` -> `?...16`
## • `?` -> `?...17`
# make new SampleID from ID and zusätzliche Codierung
kiel_data[, Site := "Kiel"]
kiel_data[, Label := paste0(ID, "_", `zusätzliche Codierung`)]
# Geschlecht == 1 is assumed to code male ("männlich") — TODO confirm codebook
kiel_data[, Sex := factor(ifelse(`Geschlecht` == 1, "männlich", "weiblich"), levels = levels(enable_data$Sex))]
kiel_data[, Age := Alter]
# height is recorded in metres in the Kiel file; convert to cm
kiel_data[, Height := `Größe (m)` * 100]
kiel_data[, FM := FM_ADP_kg]
kiel_data[, Weight := `Gewicht (kg)`]
# fat-free mass derived as total weight minus fat mass
kiel_data[, FFM := Weight - FM]
kiel_data[, Date := `Datum`]
kiel_data[, `Mean Outdoor Temperature` := `daily mean T`]
# REE is in kcal in the Kiel file; convert to kJ to match enable_data's RMR
kiel_data[, RMR := `REE (kcal)` * conversion_factor]

# predictors shared between enable and Kiel (excluding pure labelling columns)
kiel_predictors <- setdiff(intersect(names(enable_data), names(kiel_data)), 
                             other_labelling_columns)
# remove response from kiel predictors
kiel_predictors <- setdiff(kiel_predictors, response_variable)

# add (all-NA) columns that exist in enable_data but not in the Kiel data
kiel_data[, (names(enable_data)[!names(enable_data) %in% names(kiel_data)]) := NA]
# drop Kiel-only columns and order columns like enable_data before stacking
kiel_only <- setdiff(names(kiel_data), names(enable_data))
kiel_data[, (kiel_only) := NULL]
kiel_data <- kiel_data[, names(enable_data), with=FALSE]
all_data <- rbind(enable_data, kiel_data, use.names=TRUE)
all_data[, Site:=factor(Site, levels = c('Freising', 'Nuremberg', 'Kiel'))]

Build Models

Split and Check Data

suppressPackageStartupMessages(library(tidymodels))
tidymodels_prefer()

# number of NA responses
message(paste("Number of NAs in response: ", all_data |> filter(is.na(get(response_variable))) |> nrow()))
## Number of NAs in response:  8
# drop samples without a measured response
all_data <- all_data |> subset(!is.na(get(response_variable)))

# cohort summary table: n and mean ± sd per site for key variables
data.table::transpose(all_data[, c(n = .N, lapply(.SD, function(x) paste(round(mean(x, na.rm=TRUE), 2), round(sd(x, na.rm=TRUE), 2), sep = " ± "))), by=Site, .SDcols = c(age_predictor, "FM", "FFM", response_variable)], keep.names = 'Column', make.names = 'Site')
##    Column          Freising         Nuremberg              Kiel
##    <char>            <char>            <char>            <char>
## 1:      N               351               103               361
## 2:    Age      48.2 ± 18.91      78.28 ± 2.86     43.18 ± 15.19
## 3:     FM     23.21 ± 10.15      28.48 ± 7.78      32.4 ± 14.85
## 4:    FFM     53.82 ± 11.45     47.05 ± 10.63     52.38 ± 13.36
## 5:    RMR 6868.22 ± 1218.15 6399.38 ± 1198.19 6886.54 ± 1308.15
# set seed for reproducibility
set.seed(1)
# split the data by Site so one whole site forms the test set (Kiel held out entirely)
enable_split <- group_initial_split(all_data[Site %in% c('Freising', 'Nuremberg')], group = "Site")
train_data <- enable_split |> training()
test_data <- enable_split |> testing()

# ensure the split put Freising into training and Nuremberg into testing
stopifnot(all(train_data$Site == 'Freising'))
stopifnot(all(test_data$Site == 'Nuremberg'))

# Number of samples that have at least one NA value
message(paste("Number of NAs in any variable: ", train_data |>
  filter_all(any_vars(is.na(.))) |>
  nrow()))
## Number of NAs in any variable:  42
# set number of folds and repeats
nfolds <- 5
nrepeats <- 10

set.seed(1102)
repeated_cv_split <- rsample::vfold_cv(train_data, v = nfolds, repeats = nrepeats)

Configure Recipes

# create recipes
# base recipe: assign roles to every column, then log-transform the microbiome
# predictors, mean-impute numerics, drop zero-variance columns, dummy-code factors
general_recipe <-
  recipe(train_data) |>
  update_role(all_of(taxa_names), new_role = "microbiome") |>
  update_role(all_of(grouping_column), new_role = "splitting indicator") |>
  update_role(all_of(other_labelling_columns), new_role = "labels") |>
  update_role(all_of(response_variable), new_role = "outcome") |>
  update_role(all_of(microbiome_predictors), new_role = "predictor") |>
  update_role(all_of(general_predictors), new_role = "predictor") |>
  step_log(all_of(microbiome_predictors), offset = 1) |>
  step_impute_mean(all_numeric_predictors()) |>
  step_zv(all_predictors()) |>
  step_dummy(all_nominal_predictors()) 

# same as general but with normalized numeric predictors
normalized_recipe <- general_recipe |>
  step_normalize(all_numeric_predictors())

# "accessible" predictors: variables shared with the Kiel cohort,
# excluding the outdoor temperature
accessible_predictors <- kiel_predictors[kiel_predictors != temperature_predictor]

# restrict the predictor role to the accessible predictors only
general_accessible_pred_recipe <- general_recipe |>
  update_role(all_of(general_predictors), new_role = "old_predictor") |>
  update_role(all_of(microbiome_predictors), new_role = "old_predictor") |>
  update_role(all_of(accessible_predictors), new_role = "predictor")

normalized_accessible_pred_recipe <- general_accessible_pred_recipe |>
  step_normalize(all_numeric_predictors())
  
# recipe list
recipe_list <- list(general = general_recipe,
                    normalized = normalized_recipe,
                    generalAccessPred = general_accessible_pred_recipe,
                    normalizedAccessPred = normalized_accessible_pred_recipe)

Configure Models

# evaluation metrics; the first one (rmse) is used as the main selection metric
my_metrics <- metric_set(rmse, ccc, rsq, rsq_trad, mae)
main_metric <- names(attr(my_metrics, "metrics"))[1]

# linear model
lm_model <- 
  linear_reg() |> 
  set_engine("lm")

# lasso model (mixture = 1 -> pure L1 penalty)
lasso_model <- 
  linear_reg(penalty = tune(), mixture = 1) |> 
  set_engine("glmnet", standardize = TRUE)

# random forest with permutation variable importance
rf_model <-
  rand_forest(trees = tune(), min_n = tune(), mtry = tune()) |> 
  set_engine("ranger", importance = 'permutation') |> 
  set_mode("regression")

# neural network (single hidden layer); MaxNWts raised to allow larger nets
nnet_model <- 
   mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) |> 
   set_engine("nnet", MaxNWts = 2600) |> 
   set_mode("regression")

# model list
model_list <- list(linear = lm_model,
                   lasso = lasso_model,
                   RF = rf_model,
                   nnet = nnet_model)

# set the same parameters for all penalties (log10 range: 1e-10 .. 1e3)
penalty_range <- penalty(range=c(-10, 3))

# set the parameters for the lasso
lasso_params <-
  lasso_model |>
  extract_parameter_set_dials() |>
  update(penalty = penalty_range)

# set the parameters for the RF (mtry range is finalized later per workflow)
rf_params <-
  rf_model |>
  extract_parameter_set_dials() 

# set the parameters for the NN
nnet_params <- 
   nnet_model |> 
   extract_parameter_set_dials() |> 
   update(hidden_units = hidden_units(c(1, 27))) |>
   update(penalty = penalty_range)

params_list <- list(lasso = lasso_params,
                    RF = rf_params,
                    nnet = nnet_params)

Configure Workflows

grid_size <- 100
# set the workflows; cross = FALSE pairs recipes and models element-wise, so
# linear/lasso/RF get the plain recipe and nnet gets the normalized one
workflows <- workflow_set(
      preproc = c(rep(recipe_list['general'], 3),
                  rep(recipe_list['normalized'], 1),
                  rep(recipe_list['generalAccessPred'], 3),
                  rep(recipe_list['normalizedAccessPred'], 1)),
      models = rep(model_list, 2),
      cross = FALSE)
 
# iterate over the wflow_ids, add parameters and generate the mtry parameter dynamically
for (this_wflow_id in workflows$wflow_id){
  # wflow_id has the form "<recipe>_<model>"; extract the model part
  model_name <- data.table::tstrsplit(this_wflow_id, '_', fixed = TRUE, keep = 2)[[1]]
  # set the parameters for the RF in question
  params <- params_list[[model_name]]
  if (model_name == "RF"){
    # mtry upper bound = number of predictors in this workflow's recipe
    params <- params |> 
    update(mtry = mtry(c(
      1, sum(recipe_list[[sub("_.*", "", this_wflow_id)]]$var_info$role == "predictor")
    ))) 
  }
  workflows <- workflows |> 
    option_add(param_info = params, id = this_wflow_id)
}

Train Models

require(doMC)
## Loading required package: doMC
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
# register a parallel backend: one worker per resample, capped below core count
doMC::registerDoMC(cores = min(nrepeats*nfolds, parallel::detectCores() - 2))

# cache the (expensive) grid-tuning results on disk; delete the file to re-tune
repeated_cv_result_file <- "repeated_cv_results.rds"
if (file.exists(repeated_cv_result_file)) {
  print("loading grid results")
  repeated_cv_result <- readRDS(repeated_cv_result_file)
} else {
  print("creating grid results")
  repeated_cv_result <- workflows |>
    workflow_map(
      fn = "tune_grid",
      seed = 1503,
      resamples = repeated_cv_split,
      grid = grid_size,
      verbose = TRUE,
      metrics = my_metrics,
      control = control_grid(
        save_pred = TRUE
      )
    )
  saveRDS(repeated_cv_result, repeated_cv_result_file)
}
## [1] "loading grid results"
## [1] "loading grid results"

Evaluate Models and Select Best

library(ggpubr)
library(ggplot2)
# white facet strips on top of theme_bw
ggplot2::theme_set(ggplot2::theme_bw() + theme(strip.background = element_rect(fill = "white")))
fig_dir <- "figures"
if(!dir.exists(fig_dir)) dir.create(fig_dir, recursive = TRUE)

autoplot(repeated_cv_result, type="wflow_id")

# plain linear models have no tuning parameters -> they are refit separately below
wflow_ids <- repeated_cv_result$wflow_id
wflow_ids_wo_linear <- wflow_ids[!endsWith(wflow_ids, "_linear")]

# for each tuned workflow, pick the most parsimonious candidate within one
# standard error of the best; the parsimony direction depends on the model type
final_models <- sapply(wflow_ids_wo_linear, function(this_wflow_id) {
  wflow_results <- repeated_cv_result |> 
    extract_workflow_set_result(this_wflow_id)
  # print(autoplot(wflow_results) + labs(title = this_wflow_id))
  specs <- workflows |> extract_spec_parsnip(this_wflow_id)
  tuning_params <- specs |> extract_parameter_set_dials() |> pull(name)
  if (inherits(specs, "linear_reg")) {
    if (all(c("penalty", "mixture") %in% tuning_params))
      best_model <- wflow_results |> select_by_one_std_err(metric = main_metric, desc(penalty), mixture)
    else if ("penalty" %in% tuning_params) # Lasso
      best_model <- wflow_results |> select_by_one_std_err(metric = main_metric, desc(penalty))
    else if ("mixture" %in% tuning_params)
      best_model <- wflow_results |> select_by_one_std_err(metric = main_metric, mixture)
  } else if (inherits(specs, "rand_forest")) {
    best_model <- wflow_results |> select_by_one_std_err(metric = main_metric, trees, mtry, desc(min_n))
  } else if (inherits(specs, "mlp")) {
    best_model <- wflow_results |> select_by_one_std_err(metric = main_metric, epochs, hidden_units, desc(penalty))
  } else {
    stop("Unknown model")
  }
  best_model
  
}, simplify = FALSE)

# table of the one-std-err picks per workflow (wflow_id + .config)
one_std_err_results <-
  data.table::rbindlist(
    final_models,
    idcol = 'wflow_id',
    fill = TRUE
  ) |>
  select(wflow_id, .config)
# and the overall best configuration per workflow
best_results <- repeated_cv_result |> 
  rank_results(rank_metric = main_metric, select_best = TRUE) |>
  select(wflow_id, .config) |>
  unique()
results_to_keep <- rbind(one_std_err_results, best_results)

# keep only the selected configurations in the ranked results
repeated_cv_result_dt <- repeated_cv_result |> rank_results(rank_metric = main_metric) |> semi_join(results_to_keep)
repeated_cv_result_dt[, rank := factor(rank, levels = sort(unique(rank)))]

ggplot(repeated_cv_result_dt, aes(x = rank, y = mean, color = wflow_id)) +
  geom_point() +
  geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), width = .5) +
  facet_wrap(~.metric, scales = 'free')

# refit the selected configurations on the full training site and evaluate on
# the held-out site (last_fit uses the original Freising/Nuremberg split)
final_fits <- sapply(names(final_models), function(this_wflow_id)
  repeated_cv_result |>
         extract_workflow(this_wflow_id) |>
         finalize_workflow(final_models[[this_wflow_id]]) |>
         last_fit(split = enable_split, metrics=my_metrics),
  simplify = FALSE)

# fit the untuned linear-model workflows directly on the same split
linear_models <- sapply(wflow_ids[!wflow_ids %in% wflow_ids_wo_linear], function(this_wflow_id){
  this_recipe_name <- sub(pattern = '_.*$', replacement = '', this_wflow_id)
  workflow() |> 
    add_recipe(recipe_list[[this_recipe_name]]) |>
    add_model(lm_model) |> 
    last_fit(split = enable_split, metrics = my_metrics)
}, simplify = FALSE)
fits <- c(final_fits, linear_models)
# keep the one-std-err picks plus the (untuned) linear results for ranking
repeated_cv_result_dt <- repeated_cv_result_dt |> semi_join(one_std_err_results |> rbind(best_results |> subset(endsWith(wflow_id, "linear"))))
main_model <- repeated_cv_result_dt[.metric == main_metric][which.min(mean), wflow_id]

Evaluate Lasso Feature Robustness

library(glmnet)

# fixed color palette keyed by (renamed) feature for all downstream plots
feature_colors <- c(
  "FFM" = "#009999",          # Teal (greenish for FFM_SECA)
  "Weight" = "#B2DF8A",    # Light Green (for GEWICHt_SECA)
  "Mean Outdoor Temperature" = "#000000",           # Black (for mean_temp)
  "GFR" = "#ff6db6",     # Pink (blood measure GFRCKDE)
  "fT3" = "#b66dff",          # Purple (blood measure fT3)
  "MCHC" = "#490092",          # Dark Purple (blood measure MCHC)
  "Unexplained" = "#db6d00",         # Orange (for Unexplained)
  "Intra-Individual Variation" = "#FDB462" # Light Orange (for Individual Variation)
)

# Stability analysis: re-run the lasso once per CV repeat (glmnet directly,
# with fold assignments matching repeated_cv_split) and count how often each
# feature is selected at lambda.1se.
stability_dt <- data.table::rbindlist(sapply(recipe_list[c("general", "generalAccessPred")], 
                                             FUN = function(this_recipe){
  # prepare x and y data
  prepped_training_data <- this_recipe |> step_select(c(all_predictors(), all_outcomes())) |> prep() |> bake(NULL)
  x <- prepped_training_data |> select(-any_of(response_variable)) |> as.matrix()
  y <- prepped_training_data |> pull(response_variable)
  
  # get the repeats from our repeated split
  repeat_ids <- repeated_cv_split$id |> unique()
  
  # lasso runs per repeat
  cvfits <- 
    lapply(repeat_ids, function(repeat_id) {
    # Filter the splits for the specified repeat
    repeat_splits <- repeated_cv_split |> subset(id == repeat_id)
    
    # Initialize a fold ID vector (length = number of rows in the data)
    fold_ids <- integer(prepped_training_data |> nrow())
    
    # Assign fold numbers to each observation based on assessment sets
    for (i in seq_along(repeat_splits$splits)) {
      fold <- repeat_splits$splits[[i]]
      assessment_indices <- tidy(fold) |> subset(Data == "Assessment") |> pull(Row)
      fold_ids[assessment_indices] <- i  # Assign fold number `i` to assessment indices
    }
    # every observation must belong to exactly one assessment fold
    stopifnot(all(fold_ids>0))
    
    lasso <-
      cv.glmnet(
        x = x,
        y = y,
        alpha = 1,
        family = "gaussian",
        foldid = fold_ids,
        standardize = TRUE
      )
    
    lasso
  }) 
  names(cvfits) <- repeat_ids
  
  nonzero_coefs <- lapply(cvfits, function(cvfit) {
    # get the nonzero coefficients at the parsimonious lambda (1se rule)
    coefs <- coef(cvfit, s = "lambda.1se")
    nonzero_coefs <- coefs[coefs[, 1] != 0, ]
    # NOTE(review): the sort() result below is discarded (dead statement);
    # only the coefficient names are returned
    sort(nonzero_coefs, decreasing = TRUE)
    names(nonzero_coefs)
    })
  names(nonzero_coefs) <- repeat_ids
  
  # long format: values = selected feature name, ind = repeat id
  transposed_df <- stack(nonzero_coefs)
  # remove intercept from values
  transposed_df <- data.table::setDT(transposed_df) |> subset(values != "(Intercept)")
  transposed_df
}, simplify = FALSE), idcol = 'recipe')

# human-readable facet labels for the two feature sets
stability_dt[recipe == "general", Features := "Enable Features"]
stability_dt[recipe == "generalAccessPred", Features := "Accessible Features"]
stability_dt[, Features := factor(Features, levels = c("Enable Features", "Accessible Features"))]

# bar chart: selection frequency per feature, ordered by frequency
ggplot(stability_dt, aes(x = reorder(values, ind, function(x) -length(x)), fill = values)) +
    geom_bar() +
    scale_fill_manual(values = feature_colors) +
    facet_grid(Features ~.) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = 'none') +
    labs(title = "Lasso Feature Robustness", x = "Feature", y = "Number of Times Selected")

ggsave(file.path(fig_dir, "lasso_feature_robustness_both.pdf"), width = 6, height = 5)

Evaluate Best Lasso Coefficients

# Function to calculate luminance
# Relative luminance (Rec. 709 weights) for one or more hex colors.
# NOTE(review): channels are scaled to [0, 1] but not gamma-linearized, so this
# is the common approximation rather than exact WCAG relative luminance.
calculate_luminance <- function(hex_color) {
  channels <- col2rgb(hex_color) / 255
  weights <- c(0.2126, 0.7152, 0.0722)
  # weights recycle down the (red, green, blue) rows; sum per color column
  colSums(channels * weights)
}

# Decide text color (black or white) based on luminance threshold
text_colors <- sapply(feature_colors, function(color) {
  if (calculate_luminance(color) > 0.5) {
    "black"  # Light background -> Black text
  } else {
    "white"  # Dark background -> White text
  }
})
selected_coefs <- sapply(names(fits)[endsWith(names(fits), 'lasso')], function(wflow_id){
  selected_coefs <- fits[[wflow_id]] |> 
    extract_fit_parsnip() |> tidy() |> subset(estimate != 0) |> 
    mutate(abs_estimate = abs(estimate)) |> arrange(desc(abs_estimate)) |> select(-abs_estimate)
}, simplify = FALSE)
print(sub(pattern = "x (Intercept) ", replacement = "", x = selected_coefs$general_lasso[, paste(round(estimate, 1), term, sep = " x ", collapse = " + ")], fixed = TRUE))
## [1] "1520.2 + 92.3 x fT3 + 67.5 x FFM + 17.2 x MCHC + -14 x Mean Outdoor Temperature + 9.7 x Weight + 2.4 x GFR"
print(sub(pattern = "x (Intercept) ", replacement = "", x = selected_coefs$generalAccessPred_lasso[, paste(round(estimate, 1), term, sep = " x ", collapse = " + ")], fixed = TRUE))
## [1] "2369.6 + 69.2 x FFM + 13.8 x Weight + -6 x Age"
selected_coefs
## $general_lasso
## # A tidytable: 7 × 3
##   term                     estimate penalty
##   <chr>                       <dbl>   <dbl>
## 1 (Intercept)               1520.      129.
## 2 fT3                         92.3     129.
## 3 FFM                         67.5     129.
## 4 MCHC                        17.2     129.
## 5 Mean Outdoor Temperature   -14.0     129.
## 6 Weight                       9.70    129.
## 7 GFR                          2.36    129.
## 
## $generalAccessPred_lasso
## # A tidytable: 4 × 3
##   term        estimate penalty
##   <chr>          <dbl>   <dbl>
## 1 (Intercept)  2370.      60.5
## 2 FFM            69.2     60.5
## 3 Weight         13.8     60.5
## 4 Age            -6.03    60.5

Build More Complex Linear Models

# features the enable lasso selected (excluding the intercept)
selected_features <- selected_coefs$general_lasso |> subset(term != '(Intercept)') |> pull(term)
inaccessible_features <- selected_features[!selected_features %in% accessible_predictors]

# now let's make another recipe using only those features
selected_recipe <- general_recipe |>
  update_role(all_of(general_predictors), new_role = "old_predictor") |>
  update_role(all_of(microbiome_predictors), new_role = "old_predictor") |>
  update_role(all_of(selected_features), new_role = "predictor")

# second-degree polynomial terms for every numeric predictor
quadratic_recipe <- selected_recipe |>
  step_poly(all_numeric_predictors(), degree = 2) 

# all pairwise interactions between the selected predictors
interaction_recipe <- selected_recipe |>
  step_interact(~ all_predictors():all_predictors())

# selected features restricted to the accessible ones
selectedAccessible_recipe <- selected_recipe |>
  update_role(all_of(inaccessible_features), new_role = "old_predictor")

# ... plus the outdoor temperature added back in
selectedWTempAccessible_recipe <- selectedAccessible_recipe |>
  update_role(all_of(temperature_predictor), new_role = "predictor")

new_recipes <- list(selected = selected_recipe, 
                    quadratic = quadratic_recipe, 
                    interaction = interaction_recipe,
                    selectedAccess = selectedAccessible_recipe,
                    selectedWTempAccess = selectedWTempAccessible_recipe
                    )

# cross-validate the new linear models with the same repeated CV splits
new_recipe_repeated_cv_results <- workflow_set(
      preproc = new_recipes,
      models = list(linear = lm_model)) |>
      workflow_map(
        fn = "fit_resamples",
        seed = 1503,
        resamples = repeated_cv_split,
        verbose = TRUE,
        metrics = my_metrics
      ) |> rank_results()
## i 1 of 5 resampling: selected_linear
## ✔ 1 of 5 resampling: selected_linear (2.5s)
## i 2 of 5 resampling: quadratic_linear
## ✔ 2 of 5 resampling: quadratic_linear (2.6s)
## i 3 of 5 resampling: interaction_linear
## ✔ 3 of 5 resampling: interaction_linear (2.6s)
## i 4 of 5 resampling: selectedAccess_linear
## ✔ 4 of 5 resampling: selectedAccess_linear (2.6s)
## i 5 of 5 resampling: selectedWTempAccess_linear
## ✔ 5 of 5 resampling: selectedWTempAccess_linear (2.6s)
# ggplot(new_recipe_repeated_cv_results, aes(x = rank, y = mean, color = wflow_id)) +
#     geom_point() +
#     geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), width = .5) +
#     facet_wrap(~.metric, scales = 'free')

repeated_cv_result_dt <- rbind(repeated_cv_result_dt, new_recipe_repeated_cv_results)
# rerank the results based on the main metric
repeated_cv_result_dt[, wflow_id2 := factor(wflow_id, levels = repeated_cv_result_dt[.metric == main_metric][order(mean), wflow_id])]
# ggplot(repeated_cv_result_dt, aes(x = wflow_id2, y = mean, color = model)) +
#     geom_point() +
#     geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), width = .5) +
#     facet_wrap(~.metric, scales = 'free') +
#   # rotate x-axis labels 90 degree
#   theme(axis.text.x = element_text(angle = 45, hjust = 1))

recipe_list <- c(recipe_list, new_recipes)

# fit the new linear models on the original train/test split
new_models <- sapply(new_recipes, 
       function(this_recipe){
         workflow() |> 
           add_recipe(this_recipe) |>
           add_model(lm_model) |>
           last_fit(split = enable_split, metrics = my_metrics)
}, simplify = FALSE)
names(new_models) <- paste(names(new_models), 'linear', sep = "_")
fits <- c(fits, new_models)

Current Methods in Literature

# established literature RMR models (name used both as label and lookup key)
established_models <- c(`Harris-Benedict` = 'Harris-Benedict', WHO = 'WHO', `Kleiber` = 'Kleiber')

# Harris-Benedict RMR equation (kcal/day) converted to kJ via the global
# conversion_factor. Vectorized over all inputs; sex_female picks the female
# vs. male coefficient set element-wise.
HarrisBenedict <- function(weight, height, age, sex_female){
  female_rmr <- 655.1 + 9.6 * weight + 1.8 * height - 4.7 * age
  male_rmr <- 66.47 + 13.7 * weight + 5 * height - 6.8 * age
  conversion_factor * ifelse(sex_female, female_rmr, male_rmr)
}

# Kleiber's law: metabolic rate scales with body mass to the 3/4 power.
# NOTE(review): no kcal->kJ conversion is applied here, unlike the other
# equations — presumably the 283 coefficient already yields kJ/day; confirm.
Kleiber <- function(weight){
  283 * weight ^ (3 / 4)
}

# WHO RMR equations (kcal/day) by age group and sex, converted to kJ via the
# global conversion_factor. Vectorized over weight, age and sex_female.
#
# FIX(review): ages strictly between 10 and 18 match none of the age filters
# below; previously they silently kept the initialization value 0 and were
# returned as RMR = 0. The vector is now initialized with NA_real_ so that
# uncovered ages are explicit missing values instead of a silently wrong 0.
WHO <- function(weight, age, sex_female) {
  
  # Initialize RMR vector; NA marks ages not covered by any group below
  RMR <- rep(NA_real_, length(weight))
  
  # Define reusable age filters
  # NOTE(review): `age_10_30` actually covers 18-30; there is no group for
  # 10 < age < 18 (adult study) — such ages now yield NA
  age_filter <- list(
    age_3_10 = age <= 10,
    age_10_30 = age >= 18 & age <= 30,
    age_30_60 = age > 30 & age <= 60,
    age_60_plus = age > 60
  )
  
  # Calculate RMR for each age group using `ifelse` conditioned on `sex_female`
  RMR[age_filter$age_3_10] <- ifelse(sex_female[age_filter$age_3_10],
                                     (22.5 * weight[age_filter$age_3_10] + 499),
                                     (22.7 * weight[age_filter$age_3_10] + 495))
  
  RMR[age_filter$age_10_30] <- ifelse(sex_female[age_filter$age_10_30],
                                      (14.7 * weight[age_filter$age_10_30] + 496),
                                      (15.3 * weight[age_filter$age_10_30] + 679))
  
  RMR[age_filter$age_30_60] <- ifelse(sex_female[age_filter$age_30_60],
                                      (8.7 * weight[age_filter$age_30_60] + 829),
                                      (11.6 * weight[age_filter$age_30_60] + 879))
  
  RMR[age_filter$age_60_plus] <- ifelse(sex_female[age_filter$age_60_plus],
                                        (10.5 * weight[age_filter$age_60_plus] + 596),
                                        (13.5 * weight[age_filter$age_60_plus] + 487))
  
  # convert kcal/day -> kJ/day (NA propagates for uncovered ages)
  return(RMR * conversion_factor)
}

Visualize Interesting Models

# models highlighted in the figures, keyed by display name -> wflow id
interesting_models <- c(`Lasso Enable` = "general_lasso", 
                        `Lasso Accessible` = "generalAccessPred_lasso",
                        established_models
                        )

# Swap names and values (wflow id -> display name)
interesting_modelmap <- setNames(names(interesting_models), interesting_models)

# relabel the ranked results with the display names
repeated_cv_result_dt[wflow_id %in% interesting_models, wflow_id := interesting_modelmap[wflow_id]]

# get the fits for the interesting models
interesting_fits <- fits[interesting_models]
# remove NULLs (the established literature models have no tidymodels fit)
interesting_fits <- interesting_fits[!sapply(interesting_fits, is.null)]

# non-zero coefficients of the interesting fitted models, largest magnitude first
interesting_model_estimates <- data.table::rbindlist(lapply(interesting_fits, function(fit) fit |> extract_fit_parsnip() |> tidy() |> subset(estimate != 0) |> mutate(abs_estimate = abs(estimate)) |> arrange(desc(abs_estimate)) |> select(-abs_estimate)), idcol = 'model', fill = TRUE)

interesting_model_estimates[, model := interesting_modelmap[model]]
# strip backticks that protect non-syntactic column names
interesting_model_estimates[, term := gsub(pattern = '`', replacement = '', x = term, fixed = TRUE)]

print(interesting_model_estimates[term != '(Intercept)', paste(.SD[order(-abs(estimate)), term], collapse = "; "), by = model])
##               model                                                    V1
##              <char>                                                <char>
## 1:     Lasso Enable fT3; FFM; MCHC; Mean Outdoor Temperature; Weight; GFR
## 2: Lasso Accessible                                      FFM; Weight; Age

Compute Predictions and Metrics

# metrics (and display names) reported in the final tables/figures
interesting_metrics <- c(RMSE = 'rmse', R2 = 'rsq_trad')
# decimal places used when rounding each reported metric
rounding_by <- c(
  RMSE = 0,
  R2 = 3
)
# yardstick metric id -> display name
interesting_metricmap <- setNames(names(interesting_metrics), interesting_metrics)

# Predict with each fitted workflow on (baked) new data and return a long
# table with columns model / estimate / truth.
#
# fits:        named list of last_fit results; names are "<recipe>_<model>"
# recipe_list: named list of recipes, looked up via the recipe part of the name
# truth:       observed response values for new_data (recycled into each table)
# new_data:    data to predict on; NULL bakes the recipe's training data
predict_data_fits <- function(fits, recipe_list, truth, new_data = NULL) {
  data.table::rbindlist(
    sapply(names(fits), function(this_wflow_id) {
      last_fit <- fits[[this_wflow_id]]
      # recipe name = everything before the first underscore of the wflow id
      this_recipe_name <- sub(pattern = '_.*$', replacement = '', this_wflow_id)
      this_recipe <- recipe_list[[this_recipe_name]]
      # bake the recipe so the parsnip fit sees the same predictors as in training
      prepped_data <- this_recipe |> step_select(all_of(c(all_predictors(), all_outcomes()))) |> prep() |> bake(new_data = new_data)
      last_fit |> extract_fit_parsnip() |> predict(new_data = prepped_data) |> mutate(truth = truth)
    }, simplify = FALSE),
    idcol = 'model'
  ) |> rename(estimate = .pred)
}

# Predictions of the three established literature models on new_data, stacked
# into one long table with columns model / estimate / truth.
# Uses the global predictor-name constants (weight_predictor etc.) to pull the
# required raw columns; 'weiblich' marks female subjects.
predict_data_established <- function(new_data, truth){
  data.table::rbindlist(list(
    data.table::data.table(model = established_models['Harris-Benedict'], 
                           estimate = HarrisBenedict(weight = new_data[[weight_predictor]],
                                                     height = new_data[[height_predictor]],
                                                     age = new_data[[age_predictor]],
                                                     new_data[[sex_predictor]] == 'weiblich'),
                           truth = truth),
    data.table::data.table(model = established_models['Kleiber'], 
                           estimate = Kleiber(weight = new_data[[weight_predictor]]),
                           truth = truth),
    data.table::data.table(model = established_models['WHO'], 
                           estimate = WHO(weight = new_data[[weight_predictor]], 
                                          age = new_data[[age_predictor]],
                                          sex_female = new_data[[sex_predictor]] == 'weiblich'),
                           truth = truth)
    )
  )
}

# Observed metrics per model plus (optionally) bootstrap replicates of the
# same metrics for uncertainty estimation. In the returned table, rows with
# NA bootstrap_ind are the observed metrics; the rest are bootstrap replicates.
compute_my_metrics <- function(preds, my_metrics, metrics_to_keep, n_bootstrap_samples = 10000, seed = 4735684){
  # Observed (non-bootstrap) metrics per model.
  metric_dt <- preds[, my_metrics(.SD, truth = truth, estimate = estimate), by=model]
  if (n_bootstrap_samples > 0) {
    set.seed(seed)
    # # data.table only solution is too slow
    # # Get sizes per model
    # model_sizes <- preds[ , .N, by = model]
    # # Create bootstrap sample indices
    # boot_dt <- model_sizes[ , .(
    #   bootstrap_ind = rep(1:n_bootstrap_samples, each = N),
    #   row_id = sample(seq.int(N), size = N * n_bootstrap_samples, replace = TRUE)
    # ), by = model]
    # # Now compute metrics per bootstrap sample
    # bootstrap_metrics <- boot_dt[preds, on = .(model, row_id)][ , my_metrics(.SD, truth = truth, estimate = estimate), by = .(bootstrap_ind, model)]
    
    # One shared resampling index vector per replicate, reused for every model.
    # NOTE(review): this assumes every model has the same row count -- if
    # counts differ, unique(N) has length > 1 and sample() would draw from
    # those counts instead of generating indices 1..N.
    bootstrap_indices <- replicate(n_bootstrap_samples, sample(preds[ , .N, by = model][, unique(N)], replace = TRUE), simplify = FALSE)
    # Metrics on each resample, computed in parallel.
    # NOTE(review): mc.cores = n_bootstrap_samples/10 can be fractional for
    # small sample counts -- confirm pbmclapply tolerates that.
    bootstrap_metrics <- data.table::rbindlist(pbmcapply::pbmclapply(bootstrap_indices, function(indices) {
      preds[ , my_metrics(.SD[indices], truth = truth, estimate = estimate), by=model]
    }, mc.cores = min(n_bootstrap_samples/10, parallel::detectCores() / 2)), idcol = 'bootstrap_ind')
    # fill = TRUE leaves bootstrap_ind = NA on the observed rows.
    metric_dt <- rbind(metric_dt, bootstrap_metrics, use.names = TRUE, fill = TRUE)
    # metric_int <- bootstrap_metrics[, .(lower = quantile(.estimate, 0 + ((1 - .9)/2)), mean = mean(.estimate), higher = quantile(.estimate, 1 - ((1 - .9)/2))), by = .(model, .metric)]
    # metric_dt[metric_int, on = .NATURAL, c("lower", "mean", "higher") := .(lower, mean, higher)]
  }
  # metric_dt <- data.table::dcast(metric_dt, model ~ .metric, value.var = '.estimate')
  # data.table::setnames(metric_dt, old = metrics_to_keep, new = names(metrics_to_keep))
  # metric_dt[, metrics_string := do.call(
  #     paste,
  #     c(Map(function(name, value) paste(name, round(value, rounding_by[name]), sep = ": "), names(metrics_to_keep), mget(names(metrics_to_keep))), sep = "\n ")
  # )]
  # NOTE(review): metrics_to_keep is currently only referenced in the
  # commented-out block above.
  metric_dt
}
# Evaluation datasets: one per study site, plus a Kiel copy with the outdoor
# temperature predictor blanked out (to gauge that predictor's contribution).
data_by_site <- split(all_data, all_data$Site)
data_by_site$KielWoTemp <- data.table::copy(data_by_site$Kiel)
data_by_site$KielWoTemp[, (temperature_predictor) := NA]

# BMI per subject, plus the response for a later row-alignment sanity check.
bmi_dt <- all_data[, .(Site, response_check=get(response_variable), BMI = Weight/((Height/100)^2))]

# Per-site predictions from all fitted workflows plus the established
# equations; BMI is attached for the later BMI-colored plots.
preds_by_site <- sapply(names(data_by_site), function(this_data_name){
  this_data <- data_by_site[[this_data_name]]
  
  fit_preds <- predict_data_fits(fits = fits, 
                                 recipe_list = recipe_list, 
                                 truth = this_data[[response_variable]], 
                                 new_data = this_data)
  fit_preds <- rbind(
    fit_preds,
    predict_data_established(new_data = this_data, truth = this_data[[response_variable]])
  )
  # match the bmi to the predictions
  # (a site name is a prefix of the dataset name, e.g. "Kiel" of "KielWoTemp")
  bmi_site <- bmi_dt[, unique(Site)[sapply(as.character(unique(Site)), function(site) startsWith(this_data_name, site))]]
  # Recycle the site's BMI/response columns once per model via by = 'model'.
  fit_preds[, c("BMI", "response_check") := bmi_dt[Site == bmi_site, .(BMI, response_check)], by = 'model']
  # Guard against row-order mismatches between predictions and bmi_dt.
  stopifnot(fit_preds[, identical(truth, response_check)])
  fit_preds[, response_check := NULL]
  
  fit_preds
}, simplify = FALSE)

# Metrics (observed + bootstrap) per site.
metrics_by_site <- lapply(preds_by_site, compute_my_metrics, my_metrics = my_metrics, metrics_to_keep = interesting_metrics)

# Stack the per-site tables long, with the list names as a Site column.
pred_dt <- data.table::rbindlist(preds_by_site, idcol = 'Site')
metric_dt <- data.table::rbindlist(metrics_by_site, idcol = 'Site')

# Replace yardstick metric ids (rmse, rsq_trad) with display names (RMSE, R2).
metric_dt[.metric %in% interesting_metrics, .metric := interesting_metricmap[.metric]]

# Fix the site ordering; KielWoTemp is appended after the real sites.
pred_dt[, Site := factor(Site, levels = c(all_data[, levels(Site)], "KielWoTemp"))]
metric_dt[, Site := factor(Site, levels = c(all_data[, levels(Site)], "KielWoTemp"))]

# Replace workflow ids with display names for the models of interest.
pred_dt[model %in% interesting_models, model := interesting_modelmap[model]]
metric_dt[model %in% interesting_models, model := interesting_modelmap[model]]

# sort pred_dt and metric_dt 
pred_dt[, model := factor(model, levels = c(interesting_modelmap, unique(model[!model %in% interesting_modelmap])))]
metric_dt[, model := factor(model, levels = c(interesting_modelmap, unique(model[!model %in% interesting_modelmap])))]


# Tag each prediction row with its model family for coloring.
pred_dt[grep("lasso", model, ignore.case = TRUE), `Model Type` := "Lasso"]
pred_dt[model %in% established_models, `Model Type` := "Established"]

Plots

Summary

CV Summary

# One fixed color per model family, used consistently across all plots.
model_type_colors <- c(
  Established = "#66C2A5",
  Lasso = "#FC8D62",
  Nnet = "#8DA0CB",
  RF = "#E78AC3",
  Linear = "#A6D854"
)

# Split each workflow id at "_" into its recipe part ("prepping") and its
# model-family part, then title-case the family for display.
repeated_cv_result_dt[, c("prepping", "Model Type") := data.table::tstrsplit(as.character(wflow_id2), "_", fixed = TRUE)]
repeated_cv_result_dt[, `Model Type` := tools::toTitleCase(`Model Type`)]
# Workflows whose recipe name contains "Access" use the accessible predictors.
repeated_cv_result_dt[, `Accessible Predictors` := grepl(pattern = "Access", x = prepping)]

# Cross-validation summary: mean +/- SE of the main metric per workflow,
# colored by model family; diamonds (shape 5) mark workflows restricted to
# accessible predictors. Workflows refit later (new_models) are excluded.
ggplot(repeated_cv_result_dt[!wflow_id2 %in% names(new_models) & .metric == main_metric], aes(x = reorder(wflow_id, mean), y = mean, color = `Model Type`, shape = `Accessible Predictors`)) +
    geom_point(size = 3) +
    geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), width = .5) +
    scale_shape_manual(values = c(5, 16)) +
    scale_color_manual(values = model_type_colors) +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
          # A numeric legend.position is deprecated since ggplot2 3.5.0;
          # use "inside" together with legend.position.inside instead.
          legend.position = "inside",
          legend.position.inside = c(.01, .99),
          legend.justification = c("left", "top"),
          legend.box.background = element_rect(linewidth = .1),
          legend.background = element_rect(fill = NA))+
    labs(x = NULL, y = interesting_metricmap[main_metric], title = "Cross-Validation Summary")

ggsave(filename = file.path(fig_dir, "cv_summary.pdf"), width = 5, height = 4)

Test Summary

# tidytext provides reorder_within()/scale_x_reordered() for per-facet
# ordering below. library() errors if the package is missing, unlike
# require(), which only warns and returns FALSE.
library(tidytext)
summary_metrics <- data.table::copy(metric_dt)
# Human-readable site label for the temperature-ablated Kiel copy.
summary_metrics[Site == "KielWoTemp", Site := "Kiel Without Temperature"]
data.table::setnames(summary_metrics, c(".metric", ".estimate"), c("metric_name", "metric"))
# Tag each model with its family for coloring.
summary_metrics[grep("linear", model, ignore.case = TRUE), `Model Type` := "Linear"]
summary_metrics[grep("lasso", model, ignore.case = TRUE), `Model Type` := "Lasso"]
summary_metrics[model %in% established_models, `Model Type` := "Established"]

# Bootstrap pairwise model comparisons per site and metric.
# Wide layout: one column per model, one row per (Site, metric, bootstrap
# draw); rows with NA bootstrap_ind hold the observed (non-bootstrap) metric.
wide_bootstrap_res <- data.table::dcast(summary_metrics, formula = Site + metric_name + bootstrap_ind ~ model, value.var = "metric")
# All unordered model pairs (model1 < model2 drops duplicates and self-pairs).
pairwise_comparisons <- data.table::CJ(model1 = ordered(interesting_modelmap, levels = interesting_modelmap), 
                                       model2 = ordered(interesting_modelmap, levels = interesting_modelmap))[model1 < model2]
# For each pair: two-sided bootstrap p-value of the metric difference, i.e.
# the share of centered bootstrap differences at least as extreme as the
# observed one.
pairwise_tests <- wide_bootstrap_res[, 
                                     .(list = lapply(1:nrow(pairwise_comparisons), function(i){
                                       m1 <- as.character(pairwise_comparisons[i, model1])
                                       m2 <- as.character(pairwise_comparisons[i, model2])
                                       delta <- get(m1) - get(m2)
                                      
                                      # Observed difference
                                      delta_obs <- delta[is.na(bootstrap_ind)]
                                      
                                      # Bootstrap differences centered at their bootstrap mean
                                      delta_star <- delta[!is.na(bootstrap_ind)]
                                      delta_star_centered <- delta_star - mean(delta_star)
                                      
                                      # p-value: proportion of bootstrap differences >= observed difference
                                      p_value <- mean(abs(delta_star_centered) >= abs(delta_obs))
                                      
                                       .(.y. = 'metric', 
                                         p = p_value, 
                                         mean_delta = mean(delta), 
                                         group1 = m1, 
                                         group2 = m2)
                                     })), by = .(Site, metric_name)][, data.table::rbindlist(list), by = .(Site, metric_name)]


# interesting_comparisons <- data.table(group1 = interesting_modelmap[-length(interesting_modelmap)], group2 = interesting_modelmap[-1])

# Encode "model___Site" so the significance brackets match the
# tidytext::reorder_within() x-axis labels used in the faceted plots.
pairwise_tests[, group1 := paste(group1, Site, sep = "___")]
pairwise_tests[, group2 := paste(group2, Site, sep = "___")]

pairwise_tests_summary <- pairwise_tests[metric_name %in% interesting_metricmap &
                                           Site %in% c("Freising", "Nuremberg", "Kiel")]
# Benjamini-Hochberg correction within each site/metric panel, then map the
# adjusted p-values to significance stars.
pairwise_tests_summary[, p.adj :=  p.adjust(p, method = "BH"), by = .(Site, metric_name)]
pairwise_tests_summary[, p.signif := as.character(stats::symnum(x = p.adj, 
                                       cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), 
                                       symbols = c("****", "***", "**", "*", "ns")))]

score_summary_metrics <- summary_metrics[model %in% interesting_modelmap &
                                        metric_name %in% interesting_metricmap &
                                        Site %in% c("Freising", "Nuremberg", "Kiel")]

# Bracket heights: start at the per-panel bootstrap maximum, then stack the
# successive brackets with 5% vertical offsets; RMSE brackets are nudged down.
pairwise_tests_summary[score_summary_metrics[!is.na(bootstrap_ind), 
                                     .(y.position = max(metric)), 
                                     by = .(Site, metric_name)],
               on = .NATURAL,
               y.position := y.position]
pairwise_tests_summary[, y.position := y.position + y.position * (.05 * (seq.int(.N) - 1)), by = .(Site, metric_name)]
pairwise_tests_summary[metric_name == "RMSE", y.position := y.position - 200]

# Score summary panel: bars = observed metric, pointranges = bootstrap mean
# with 90% percentile interval, brackets = BH-adjusted significance codes.
score_summary <- ggplot(score_summary_metrics[!is.na(bootstrap_ind)],
       aes(x = tidytext::reorder_within(model, metric, Site), y = metric)) +
    geom_col(data = score_summary_metrics[is.na(bootstrap_ind)], aes(fill = `Model Type`), color = 'white', alpha = 0.7) +
    # 90% percentile interval of the bootstrap distribution.
    stat_summary(fun.data = function(x) list(y = mean(x), ymin = quantile(x, 0 + ((1 - .9)/2)), ymax = quantile(x, 1 - ((1 - .9)/2))), 
                 geom = "pointrange", fatten = 2) +
    stat_pvalue_manual(pairwise_tests_summary, 
                       label = "p.signif", tip.length = 0, size = 2) +
    scale_fill_manual(values = model_type_colors) +
    coord_cartesian(ylim = c(0, NA)) +
    # add the value as text into the bar, ninety degrees rotated
    geom_text(data = score_summary_metrics[is.na(bootstrap_ind)], aes(label = round(metric, rounding_by[metric_name]), y = 0), hjust = -.15, vjust = 1.4, angle = 90) +
    facet_grid(metric_name ~ Site, scales = "free") +
    labs(x = NULL, y = NULL) +
    tidytext::scale_x_reordered() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "left")
ggsave(filename = file.path(fig_dir, "score_summary.pdf"), plot = score_summary, width = 8, height = 6)

# Same comparison for Kiel with vs. without the temperature predictor,
# restricted to the fitted (non-established) models of interest.
score_summary_metrics_kiel <- summary_metrics[model %in% interesting_modelmap[!interesting_modelmap %in% established_models] & 
                                                metric_name %in% interesting_metricmap &
                                                Site %in% c("Kiel", "Kiel Without Temperature")]

# Filter the pairwise tests to the same models (group columns are
# "model___Site" strings, hence the regex matching).
pairwise_tests_summary_kiel <- 
  pairwise_tests[grepl(pattern = paste(interesting_modelmap[!interesting_modelmap %in% established_models], collapse = "|"), x = group1) &
                 grepl(pattern = paste(interesting_modelmap[!interesting_modelmap %in% established_models], collapse = "|"), x = group2) &
                 metric_name %in% interesting_metricmap &
                Site %in% c("Kiel", "Kiel Without Temperature")]

# BH correction within each site/metric group, mapped to significance stars.
pairwise_tests_summary_kiel[, p.signif := as.character(stats::symnum(x = p.adjust(p, method = "BH"), 
                                       cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), 
                                       symbols = c("****", "***", "**", "*", "ns"))), 
                       by = .(Site, metric_name)]



# Bracket heights: start at the per-panel bootstrap maximum, stack with 5%
# vertical offsets.
pairwise_tests_summary_kiel[score_summary_metrics_kiel[!is.na(bootstrap_ind), 
                                     .(y.position = max(metric)), 
                                     by = .(Site, metric_name)],
               on = .NATURAL,
               y.position := y.position]
pairwise_tests_summary_kiel[, y.position := y.position + y.position * (.05 * (seq.int(.N) - 1)), by = .(Site, metric_name)]

# Kiel score summary: bars = observed metric, pointranges = bootstrap mean
# with 90% percentile interval, brackets = BH-adjusted significance codes.
score_summary_kiel <- ggplot(score_summary_metrics_kiel[!is.na(bootstrap_ind)],
    aes(x = tidytext::reorder_within(model, metric, Site), y = metric)) +
    geom_col(data = score_summary_metrics_kiel[is.na(bootstrap_ind)], aes(fill = `Model Type`), color = 'white', alpha = 0.7) +
    # 90% percentile interval of the bootstrap distribution.
    stat_summary(fun.data = function(x) list(y = mean(x), ymin = quantile(x, 0 + ((1 - .9)/2)), ymax = quantile(x, 1 - ((1 - .9)/2))), 
                 geom = "pointrange", fatten = 2) +
    stat_pvalue_manual(pairwise_tests_summary_kiel, label = "p.signif", tip.length = 0, size = 2) +
    scale_fill_manual(values = model_type_colors) +
    # add the value as text into the bar, ninety degrees rotated
    geom_text(data = score_summary_metrics_kiel[is.na(bootstrap_ind)], aes(label = round(metric, rounding_by[metric_name]), y = 0), hjust = -.15, angle = 90) +
    facet_grid(metric_name ~ Site, scales = "free") +
    labs(x = NULL, y = NULL) + 
    tidytext::scale_x_reordered() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
ggsave(filename = file.path(fig_dir, "score_summary_kiel.pdf"), plot = score_summary_kiel, width = 4, height = 4)

Prediction and Error Plots

# Observed-vs-predicted scatter with identity line and per-panel metric
# annotations. Faceting is enabled only when both facet variables are
# non-empty strings; focus_observed restricts both axes to the observed range.
plot_preds <- function(preds, metric_dt, main_metric, facet_col = "Site", facet_row = "model", color_col = "Model Type", focus_observed = FALSE){
  p <- ggplot(preds, aes(x = truth, y = estimate, color = get(color_col))) +
    geom_abline(color = "gray50", lty = 2) + 
    geom_point(alpha = 0.5) + 
    labs(title = NULL, x = 'Observed RMR in kJ/day', y = 'Predicted RMR in kJ/day') +
    theme(legend.position = "none") +
    # Per-panel metric annotation pinned to the upper-left corner.
    geom_label(data = metric_dt, aes(x = -Inf, y = Inf, label = metrics_string), 
               vjust = 1.1, hjust = -0.1, inherit.aes = FALSE, size = 3)
  # Discrete color column -> fixed model-family palette; numeric -> gradient.
  if (preds[, !is.numeric(get(color_col))])
    p <- p + scale_color_manual(values = model_type_colors)
  else 
    p <- p + scale_color_gradient2()
  # Scalar condition: use the short-circuiting && instead of the vectorized &.
  if (facet_col != "" && facet_row != "") {
    p <- p + facet_grid(get(facet_col) ~ get(facet_row))
  }
  if (focus_observed) {
    # Square axes limited to the range of observed values.
    p <- p +
      coord_obs_pred(xlim=c(min(preds$truth), max(preds$truth)),
                     ylim=c(min(preds$truth), max(preds$truth)))
  } else {
    p <- p +
      coord_obs_pred()
  }
  p
}

# Bland-Altman plot: difference (observed - predicted) against the mean of the
# two, with mean-difference and +/-1.96*SD limit-of-agreement lines, computed
# per facet cell when faceting is active.
# NOTE: adds the derived columns to `preds` by reference (data.table side
# effect) -- callers see the mutated table.
plot_bland_altman <- function(preds, metric_dt, main_metric, facet_col = "Site", facet_row = "model", color_col = "Model Type") {
  preds[, mean_measurement := (truth + estimate) / 2]
  preds[, difference := truth - estimate]
  # Scalar condition: use the short-circuiting && instead of the vectorized &.
  if (facet_col != "" && facet_row != "") {
    # Limits of agreement computed per facet cell.
    preds[, mean_difference := mean(difference), by=.(get(facet_row), get(facet_col))]
    preds[, lower_difference:= mean_difference - sd(difference)*1.96, by=.(get(facet_row), get(facet_col))]
    preds[, upper_difference:= mean_difference + sd(difference)*1.96, by=.(get(facet_row), get(facet_col))]
  } else {
    preds[, mean_difference := mean(difference)]
    preds[, lower_difference:= mean_difference - sd(difference)*1.96]
    preds[, upper_difference:= mean_difference + sd(difference)*1.96]
  }
  
  p <- ggplot(preds, aes(x = mean_measurement, y = difference, color = get(color_col))) +
    geom_hline(aes(yintercept = mean_difference)) +
    geom_hline(aes(yintercept = lower_difference), linetype = 'dashed') +
    geom_hline(aes(yintercept = upper_difference), linetype = 'dashed') +
    geom_point(alpha = 0.5) + 
    labs(title = NULL, x = 'Mean of Observed and Predicted', y = 'Observed - Predicted') +
    theme(legend.position = "none")
  
  # Discrete color column -> fixed model-family palette.
  if (preds[, !is.numeric(get(color_col))])
    p <- p + scale_color_manual(values = model_type_colors)
  
  if (facet_col != "" && facet_row != "") {
    p <- p + facet_grid(get(facet_col) ~ get(facet_row))
  }
  p
}
sites <- c("Freising", "Nuremberg", "Kiel", "KielWoTemp")
# Models shown in the main comparison: the lasso variants plus the
# established literature equations.
models <- c(interesting_modelmap[startsWith(interesting_modelmap, "Lasso")],
            established_models)
# WHO BMI classes; right = FALSE makes intervals left-closed, e.g. [18.5,25).
pred_dt[, `BMI Category` := cut(BMI, breaks = c(0, 16, 17, 18.5, 25, 30, 35, 40, Inf), right = FALSE)]
pred_dt[, `BMI Category` := factor(`BMI Category`, levels = c("[0,16)", "[16,17)", "[17,18.5)", "[18.5,25)", "[25,30)", "[30,35)", "[35,40)", "[40,Inf)"))]
# WHO labels for the BMI intervals created above.
bmi_mapping <- c(`[0,16)` = "Underweight (Severe thinness)",
                 `[16,17)` = "Underweight (Moderate thinness)",
                 `[17,18.5)` = "Underweight (Mild thinness)",
                 `[18.5,25)` = "Normal range",
                 `[25,30)` = "Overweight (Pre-obese)",
                 `[30,35)` = "Obese (Class I)",
                 `[35,40)` = "Obese (Class II)",
                 `[40,Inf)` = "Obese (Class III)")
# `BMI Category` is a factor here; subscripting a named vector with a factor
# uses the factor's integer codes, which is only correct while bmi_mapping
# happens to be in level order. Index by level name instead so the recode is
# robust to reordering.
pred_dt[, `BMI Category` := bmi_mapping[as.character(`BMI Category`)]]
pred_dt[, `BMI Category` := ordered(`BMI Category`, levels = bmi_mapping)]

# One color per BMI class (blue -> red with increasing BMI).
bmi_colors <- c("#7c7cbd", "#7c7cfd", "#7cfcfc", "#7cfc7c", "#fcfc7c", "#fcbb91", "#fc9192", "#c08081")
names(bmi_colors) <- bmi_mapping

# One "RMSE: x\n R2: y" annotation string per model/site, built from the
# observed (non-bootstrap) metrics; mget() pulls the freshly cast metric
# columns inside the data.table j-expression.
metric_string_dt <- data.table::dcast(metric_dt[is.na(bootstrap_ind)], model + Site ~ .metric, value.var = '.estimate')[, metrics_string := do.call(
    paste,
    c(Map(function(name, value) paste(name, round(value, rounding_by[name]), sep = ": "), interesting_metricmap, mget(interesting_metricmap)), sep = "\n ")
)]

# Bland-Altman plot of the main models across sites, colored by BMI category.
pred_obs <-
        plot_bland_altman(
          pred_dt[model %in% models &
                    Site %in% sites],
          metric_string_dt[model %in% models &
                           Site %in% sites],
          main_metric = interesting_metricmap[main_metric],
          facet_col = "Site",
          facet_row = "model",
          color_col = "BMI Category"
        )
# print(pred_obs + 
#         geom_point(alpha = 1, size = .5) +
#         guides(color = guide_legend(byrow = T)) +
#         scale_color_manual(values = bmi_colors) +
#         theme(legend.position = "top") + labs(color = "BMI Category"))
# Override the point layer and color scale with a brewer palette
# (hence the "scale ... already present" message below).
print(pred_obs + 
        geom_point(alpha = 1, size = .4) +
        guides(color = guide_legend(byrow = T)) +
        scale_color_brewer(palette = "RdYlBu", direction = -1) +
        theme(legend.position = "top") + labs(color = "BMI Category"))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

# print(pred_obs + 
#         geom_point(alpha = 1, size = .5) +
#         guides(color = guide_legend(byrow = T)) +
#         scale_color_brewer(type = "div", direction = -1) +
#         theme(legend.position = "top") + labs(color = "BMI Category"))
library(patchwork)
# Render prediction-vs-observed and Bland-Altman plots for two configurations:
# (1) all sites x (lasso + established) models, (2) Freising x Lasso Enable.
pred_plots <-
  mapply(
    sites = list(c("Freising", "Nuremberg", "Kiel", "KielWoTemp"),
                 c("Freising")),
    models = list(
      c(interesting_modelmap[startsWith(interesting_modelmap, "Lasso")],
        established_models
    ), 
      c("Lasso Enable")),
    fig.sizes = list(c(15, 15), 
                     c(4, 4)),
    FUN = function(sites, models, fig.sizes) {
      fig_suffix <- paste0(paste(sites, collapse="-"), "_", paste(models, collapse="-"), ".pdf")
      pred_obs <-
        plot_preds(
          pred_dt[model %in% models &
                    Site %in% sites],
          metric_string_dt[model %in% models &
                           Site %in% sites],
          main_metric = interesting_metricmap[main_metric],
          # Facet only along dimensions with more than one level.
          facet_col = ifelse(length(sites) > 1, "Site", ""),
          facet_row = ifelse(length(models) > 1, "model", ""),
          focus_observed = TRUE
        )
      print(pred_obs)
      ggsave(filename = file.path(fig_dir, paste0("pred_obs_", fig_suffix)), plot = pred_obs, width = fig.sizes[1], height = fig.sizes[2])
      ba <-
        plot_bland_altman(
          pred_dt[model %in% models &
                    Site %in% sites],
          metric_string_dt[model %in% models &
                           Site %in% sites],
          main_metric = interesting_metricmap[main_metric],
          facet_col = ifelse(length(sites) > 1, "Site", ""),
          # NOTE(review): the trailing comma below leaves an empty argument;
          # R matches it to the remaining formal (color_col), which then falls
          # back to its default -- consider removing the comma.
          facet_row = ifelse(length(models) > 1, "model", ""),
        )
      print(ba)
      ggsave(filename = file.path(fig_dir, paste0("ba_", fig_suffix)), plot = ba, width = fig.sizes[1], height = fig.sizes[2])
      # Side-by-side combination via patchwork's `+`.
      ggsave(filename = file.path(fig_dir, paste0("ba_pred_obs_", fig_suffix)), plot = (pred_obs + ba), width = fig.sizes[1]*2 - 1, height = fig.sizes[2])
      list(
        pred_obs = pred_obs,
        ba = ba
      )      
    },
    SIMPLIFY = FALSE
  )

# Combine the Freising observed-vs-predicted panel with the score summary.
pred_obs_freising <- pred_plots[[2]]$pred_obs
pred_obs_score_summary <- ggarrange(pred_obs_freising, score_summary, widths = c(4,8), labels = c("A", "B"))
pred_obs_score_summary

ggsave(file = file.path(fig_dir, "pred_obs_score_summary.pdf"), plot = pred_obs_score_summary, width = 12, height = 6)

Explained Variance Selected Features on Freising

# ggrepel provides geom_label_repel() used further below. library() errors if
# the package is missing, unlike require(), which only warns.
library(ggrepel)
# Extract the underlying lm engine fit from each refitted workflow.
selected_lm_fits <- sapply(new_models,
       function(model) {
         lm_fit <- model |> extract_fit_engine()
         # Kept for the optional significance printout in the comment below.
         coef_mat <- lm_fit |> summary() |> coefficients()
         # coef_mat[coef_mat[, 'Pr(>|t|)'] < 0.05, , drop = FALSE] |> print()
         lm_fit
       }, simplify = FALSE)

# Relative importance (explained-variance decomposition) of the selected
# linear model's predictors; the pmvd shares are used below.
feature_imps <- c("lmg", "pmvd")
linear_imp <- relaimpo::calc.relimp(selected_lm_fits$selected_linear, type = feature_imps)
# NOTE(review): the 0.043 intra-individual variation share is hard-coded --
# presumably from a repeatability estimate; confirm its source.
expl_variance <- data.table::setDT(enframe(c(linear_imp$pmvd, `Intra-Individual Variation` = 0.043, Unexplained = 0), 'Feature', 'Explained Variance'))
# "Unexplained" takes whatever share remains after all other components.
expl_variance[Feature == 'Unexplained', `Explained Variance` := 1 - sum(expl_variance$`Explained Variance`)]
# Order features by decreasing pmvd share, with the two extra rows last.
expl_variance[, Feature:=factor(Feature, levels=c(names(sort(linear_imp$pmvd, decreasing = TRUE)), 'Unexplained', 'Intra-Individual Variation'))]
data.table::setkey(expl_variance, Feature)
# Cumulative shares give the x midpoints for the stacked-bar labels.
expl_variance[, cum_var := cumsum(`Explained Variance`)]
expl_variance[, x_space := cum_var - `Explained Variance`/2]

# Single stacked bar of explained-variance shares with repelled percentage
# labels positioned at each segment's midpoint.
expl_variance_plot <- ggplot(expl_variance, aes(x = `Explained Variance`, y = 'Feature', fill = reorder(Feature, -x_space))) + 
    geom_col(position = 'fill') + 
    scale_fill_manual(values = feature_colors, guide = guide_legend(reverse = TRUE, nrow = 1)) + 
    theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.position = 'top') + 
    ggrepel::geom_label_repel(aes(label = round(`Explained Variance` * 100, 1),
                                  x = x_space),
                              max.overlaps = 10,
                              min.segment.length = 0,
                              force = 10,
                              direction = 'y',
                              show.legend = FALSE,
                              color = text_colors[expl_variance[, levels(Feature)]],
                              segment.color = text_colors[expl_variance[, levels(Feature)]]) + 
    labs(y = NULL, fill = NULL)
ggsave(filename = file.path(fig_dir, "selected_features_expl_variance.pdf"), plot = expl_variance_plot, width = 8.5, height = 3)

Data Plots

# Temperature vs RMR scatter with marginal distributions, plus per-site
# regression lines and correlation labels.
scatter <- ggscatterhist(all_data, x = "Mean Outdoor Temperature", y = "RMR", color = "Site", xlab = "Mean Daily Temperature in °C", ylab = "RMR in kJ", alpha=.5, print = FALSE, legend = "bottom")
scatter$sp <- scatter$sp + geom_smooth(method = 'lm', se = FALSE, aes(color = Site)) + stat_cor(size = 3, aes(color = Site))
pdf(file.path(fig_dir, "scatter_temp_rmr.pdf"), width = 7, height = 7)
scatter
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
dev.off()
## png 
##   2
print(scatter)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Rebuild the scatter + marginals as one grob (via ggpubr internals) so it can
# be arranged next to the Kiel score summary.
scatter_kiel_scores <- ggpubr:::.insert_xaxis_grob(scatter$sp, scatter$xplot, grid::unit(1/5, 
                                                "null"), position = "top")
## `geom_smooth()` using formula = 'y ~ x'
scatter_kiel_scores <- ggpubr:::.insert_yaxis_grob(scatter_kiel_scores, scatter$yplot, grid::unit(1/5, 
                                                 "null"), position = "right")
scatter_kiel_scores <- ggarrange(as_ggplot(scatter_kiel_scores), score_summary_kiel, labels = c("A", "B"))
ggsave(filename = file.path(fig_dir, "scatter_temp_rmr_kiel_scores.pdf"), plot = scatter_kiel_scores, width = 14, height = 7)
library(pheatmap)
library(seriation)
cor_method <- "pearson"
# Pairwise-complete correlations on the fully prepped (baked) design matrix.
cor_mat <-
  general_recipe |> step_select(c(all_predictors(), all_outcomes())) |> prep() |> bake(NULL) |> cor(use = "pairwise", method = cor_method)
# Nicer label for the dummy-coded sex column.
colnames(cor_mat)[colnames(cor_mat) == "Sex_weiblich"] <- "Sex (f)"
rownames(cor_mat)[rownames(cor_mat) == "Sex_weiblich"] <- "Sex (f)"
# Correlations of microbiome features with RMR, sorted descending, printed
# below as CSV.
microbiome_cor_mat <- round(cor_mat[column_groups[group %in% c("microbiome family", "microbiome diversity"), column], "RMR", drop = FALSE], 3)
microbiome_cor_mat <- microbiome_cor_mat[order(microbiome_cor_mat[, "RMR"], decreasing = TRUE), , drop=FALSE]
colnames(microbiome_cor_mat)[colnames(microbiome_cor_mat) == "RMR"] <- "RMR (kJ/day)"
write.table(microbiome_cor_mat, quote = FALSE, sep = ',')
## RMR (kJ/day)
## f__Sutterellaceae,0.16
## f__Prevotellaceae,0.135
## f__unknown_ Bacteroidetes,0.129
## f__Succinivibrionaceae,0.111
## f__Acidaminococcaceae,0.107
## f__Coriobacteriaceae,0.103
## f__Erysipelotrichaceae,0.101
## f__Enterococcaceae,0.073
## f__Veillonellaceae,0.066
## f__Lachnospiraceae,0.052
## f__Pasteurellaceae,0.037
## f__Clostridiaceae 1,0.035
## f__Comamonadaceae,0.035
## f__Rhodospirillaceae,0.033
## f__Enterobacteriaceae,0.032
## f__Fusobacteriaceae,0.017
## f__Bifidobacteriaceae,0.014
## f__Peptococcaceae 1,0.011
## f__Eubacteriaceae,-0.003
## f__Peptostreptococcaceae,-0.01
## f__Spirochaetaceae,-0.011
## f__Lactobacillaceae,-0.022
## f__unknown_ Desulfovibrionales,-0.032
## f__Methanobacteriaceae,-0.036
## f__unknown_ Rhodospirillales,-0.036
## f__Puniceicoccaceae,-0.038
## f__Bacteroidaceae,-0.042
## f__unknown_ Proteobacteria,-0.057
## f__Streptococcaceae,-0.061
## f__unknown_ Alphaproteobacteria,-0.064
## f__Victivallaceae,-0.07
## f__unknown_ Burkholderiales,-0.072
## f__unknown_ Clostridia,-0.072
## f__Elusimicrobiaceae,-0.078
## f__Desulfovibrionaceae,-0.082
## f__Verrucomicrobiaceae,-0.097
## f__unknown_ Bacteroidales,-0.1
## f__Anaeroplasmataceae,-0.104
## f__unknown_ Deltaproteobacteria,-0.104
## f__unknown_ Bacteria,-0.122
## Simpson.effective,-0.128
## f__unknown_ Firmicutes,-0.136
## Shannon.effective,-0.147
## f__unknown_ Clostridiales,-0.156
## f__Rikenellaceae,-0.174
## f__Ruminococcaceae,-0.188
## f__Porphyromonadaceae,-0.219
# Correlations of the remaining (non-microbiome) variables with RMR, sorted
# descending and printed below as CSV.
non_microbiome_cor_mat <- round(cor_mat[!rownames(cor_mat) %in% rownames(microbiome_cor_mat), "RMR", drop = FALSE], 3)
non_microbiome_cor_mat <- non_microbiome_cor_mat[order(non_microbiome_cor_mat[, "RMR"], decreasing = TRUE), , drop=FALSE]
colnames(non_microbiome_cor_mat)[colnames(non_microbiome_cor_mat) == "RMR"] <- "RMR (kJ/day)"
write.table(non_microbiome_cor_mat, quote = FALSE, sep = ',')
## RMR (kJ/day)
## RMR,1
## FFM,0.853
## Weight,0.722
## Height,0.706
## Hemoglobin,0.538
## Waist,0.493
## Hematocrit,0.484
## Uric Acid,0.471
## Visceral Fat,0.456
## MCHC,0.422
## fT3,0.404
## Hip,0.349
## GPT ALT,0.343
## Creatinine,0.334
## Ferritin,0.281
## Triglycerides,0.279
## GGT,0.251
## BP Diastolic,0.247
## Iron,0.204
## GFR,0.203
## BP Systolic,0.199
## propionic acid a,0.187
## Glucose,0.185
## FM,0.182
## GOT AST,0.171
## Insulin,0.159
## butyric acid a,0.12
## Urea,0.099
## acetic acid a,0.074
## Leukocytes,0.059
## Body Temperature,0.051
## TSH,0.043
## Heart Rate,0.035
## Isobutyrate a,0.022
## Isovalerate a,0.002
## LDH,-0.002
## 2-Methylbutyrate a,-0.005
## pentanoic acid a,-0.009
## Lactic acid a,-0.019
## 4-Methylvaleric acid a,-0.028
## LDL,-0.038
## hexanoic acid c,-0.039
## hsCRP,-0.058
## Mean Outdoor Temperature,-0.147
## Age,-0.184
## Cholesterol,-0.199
## HDL,-0.49
## Sex (f),-0.658
# Hierarchical clustering on correlation distance (1 - r) with complete
# linkage; leaves reordered by optimal leaf ordering (OLO) for a cleaner
# heatmap layout.
dist_obj = as.dist(1 - cor_mat)
clust <- hclust(dist_obj, method = "complete") # do not use "ward.D2" here, because we use correlation distance
clust <-
  seriation:::reorder.hclust(clust, dist = dist_obj, method = "OLO")
data.table::setkey(column_groups, column)
# Row annotation: variable group per feature, title-cased for display.
anno_df <- as.data.frame(column_groups[, "group", with = FALSE], row.names = column_groups[, column])
rownames(anno_df)[rownames(anno_df) == "Sex"] <- "Sex (f)"
anno_df$group <- tools::toTitleCase(as.character(anno_df$group))
my_heatmap <- pheatmap(
  cor_mat,
  cluster_rows = clust,
  treeheight_col = 0,
  cluster_cols = clust,
  # Annotation rows ordered to match the dendrogram's leaf order.
  annotation_row = anno_df[clust$labels[clust$order], , drop = FALSE],
  # annotation_row = anno_df[clust$labels[clust$order], , drop = FALSE],
  color = colorRampPalette(rev(
    RColorBrewer::brewer.pal(n = 7, name = "RdBu")
  ))(100), 
  silent = TRUE
)
# Write the heatmap to PDF, then draw it again for the rendered document.
pdf(file.path(fig_dir, paste0(
  "correlation_heatmap_", cor_method, ".pdf"
)), width = 19, height = 14)
grid::grid.newpage()
grid::grid.draw(my_heatmap$gtable)
dev.off()
## png 
##   2
grid::grid.newpage()
grid::grid.draw(my_heatmap$gtable)